/*
  author Sylvain Bertrand <sylvain.bertrand@gmail.com>
  Licensed under the GNU GPLv2
  Copyright 2012-2014
*/
#include <linux/pci.h>
#include <asm/byteorder.h>
#include <linux/cdev.h>
#include <linux/vmalloc.h>

#include <alga/alga.h>
#include <alga/rng_mng.h>
#include <alga/timing.h>
#include <uapi/alga/pixel_fmts.h>
#include <uapi/alga/amd/dce6/dce6.h>

#include "mc.h"
#include "ih.h"
#include "rlc.h"
#include "fence.h"
#include "ring.h"
#include "dmas.h"
#include "ba.h"
#include "cps.h"
#include "gpu.h"
#include "drv.h"

#include "regs.h"

#include "ba_private.h"

/*
 * Validate the dma-mapped scatterlist segments and accumulate into
 * m->ptes_n the number of gpu-page ptes this mapping will need.
 * Returns 0 on success, -BA_ERR if any segment start or size is not a
 * multiple of the gpu page size.
 */
static long ptes_account(struct pci_dev *dev, struct sg_table *sg_tbl,
						int nents, struct ba_map *m)
{
	struct scatterlist *sg;
	int seg;

	for_each_sg(sg_tbl->sgl, sg, nents, seg) {
		dma_addr_t seg_bus_addr = sg_dma_address(sg);
		unsigned int seg_bus_sz = sg_dma_len(sg);

		/* both the segment start and its size must be gpu-page multiples */
		if (!IS_GPU_PAGE_ALIGNED(seg_bus_addr)) {
			dev_err(&dev->dev,"ba:sg_user:trying to map a bus segment not aligned on a gpu page\n");
			return -BA_ERR;
		}
		if (!IS_GPU_PAGE_ALIGNED(seg_bus_sz)) {
			dev_err(&dev->dev,"ba:sg_user:trying to map a bus segment of size not aligned on gpu page size\n");
			return -BA_ERR;
		}

		/* one pte per gpu page covered by this segment */
		m->ptes_n += GPU_PAGE_IDX(seg_bus_sz);
	}
	return 0;
}

/*
 * Write one pte per gpu page of the bus segment, starting at pte_cpu_addr.
 * Ptes are stored little-endian (cpu_to_le64) and flag the page as a valid,
 * snooped, read/write system (gart) page.
 * Returns the pte slot following the last one written.
 */
static u64 __iomem *bus_segment_map(struct pci_dev *dev, u64 bus_segment_addr,
				u64 bus_segment_sz, u64 __iomem *pte_cpu_addr)
{
	u64 pages_n = GPU_PAGE_IDX(bus_segment_sz);
	u64 page;

	for (page = 0; page < pages_n; ++page) {
		u64 bus_addr = bus_segment_addr + page * GPU_PAGE_SZ;

		pte_cpu_addr[page] = cpu_to_le64(bus_addr | PTE_VALID
				| PTE_SYSTEM | PTE_SNOOPED | PTE_READABLE
				| PTE_WRITEABLE);
	}
	return pte_cpu_addr + pages_n;
}

static void ptes_cpu_side_update(struct pci_dev *dev, struct ba_map *m,
					struct sg_table *sg_tbl, int nents)
{
	struct dev_drv_data *dd;
	u64 __iomem *pte_cpu_addr;
	struct scatterlist *sg;
	int i;

	dd = pci_get_drvdata(dev);

	/* locate the first pte of this mapping cpu side */
	pte_cpu_addr = dd->ba.pt_map->cpu_addr;
	pte_cpu_addr += GPU_PAGE_IDX(m->gpu_addr - dd->ba.mng.s);

	for_each_sg(sg_tbl->sgl, sg, nents, i) {
		dma_addr_t bus_segment_addr;
		unsigned int bus_segment_sz;

		bus_segment_addr = sg_dma_address(sg);
		bus_segment_sz = sg_dma_len(sg);

		pte_cpu_addr = bus_segment_map(dev, bus_segment_addr,
						bus_segment_sz, pte_cpu_addr);
	}
}

/*
 * NOTE: updating the ptes *content* with the dma engines appeared to hang
 * something in the GPU — only a reboot "unblocked" it. Hence the ptes are
 * written cpu side first, then dma-copied from the aperture into vram here.
 */
static void sg_user_dma_pkt_cpy_blks_wr(struct pci_dev *dev, struct ba_map *m,
			u8 dma, u32 dma_pkt_cpy_blks_n, u64 last_blk_dma_sz)
{
	struct dev_drv_data *dd = pci_get_drvdata(dev);
	u64 dst_vram = m->ptes_start;		/* destination */
	u64 src_aperture;			/* source */

	/* ptes of this mapping, as seen through the gpu aperture */
	src_aperture = dd->ba.pt_map->gpu_addr
			 + GPU_PAGE_IDX(m->gpu_addr - dd->ba.mng.s) * PTE_SZ;

	dma_pkt_cpy_blks_wr(dev, dma, dst_vram, src_aperture,
					dma_pkt_cpy_blks_n, last_blk_dma_sz);
}

#if 0
/*
 * Disabled, dead code kept for reference: vm flush issued through the dma
 * ring (SRBM register writes). Presumably related to the dma-engine pte
 * update hang mentioned above sg_user_dma_pkt_cpy_blks_wr — confirm before
 * re-enabling.
 */
#define DMA_FLUSH_VM_DWS_N  9
static void dma_vm_flush_wr(struct pci_dev *dev, u8 dma)
{
	struct dev_drv_data *dd;
	dd = pci_get_drvdata(dev);
	/* XXX: why?? */
	dma_wr(dev, dma, DMA_PKT(DMA_PKT_SRBM_WR, 0, 0, 0, 0));
	dma_wr(dev, dma, (0xf << 16) | (VM_CTX_0_PT_BASE_ADDR >> 2));
	dma_wr(dev, dma, GPU_PAGE_IDX(dd->ba.pt_start));

	/* flush hdp cache */
	dma_wr(dev, dma, DMA_PKT(DMA_PKT_SRBM_WR, 0, 0, 0, 0));
	dma_wr(dev, dma, (0xf << 16) | (HDP_MEM_COHERENCY_FLUSH_CTL >> 2));
	dma_wr(dev, dma, 1);

	/* bits 0-15 are the vm contexts 0-15 */
	dma_wr(dev, dma, DMA_PKT(DMA_PKT_SRBM_WR, 0, 0, 0, 0));
	dma_wr(dev, dma, (0xf << 16) | (VM_INVALIDATE_REQ >> 2));
	dma_wr(dev, dma, 1);
}
#endif

#define DMA_FENCE_DWS_N 11
/*
 * Queue on the dma ring: a fence write (to the writeback buffer), a trap
 * (interrupt), an hdp cache flush and a vm invalidate. Writes exactly
 * DMA_FENCE_DWS_N (11) dwords: 4 + 1 + 3 + 3 — the caller must have booked
 * that many on the ring. Returns the fence sequence number to wait on.
 */
static u32 dma_fence_wr(struct pci_dev *dev, u8 dma)
{
	struct dev_drv_data *dd;
	u64 wb_fence_gpu_addr;
	u32 fence_seq_n;

	dd = pci_get_drvdata(dev);

	/* write the fence, gpu addr must be dw aligned */
	wb_fence_gpu_addr = dd->ba.wb_map->gpu_addr + wb_dma_fence_of[dma];
	dma_wr(dev, dma, DMA_PKT(DMA_PKT_FENCE, 0, 0, 0, 0));
	dma_wr(dev, dma, lower_32_bits(wb_fence_gpu_addr));
	dma_wr(dev, dma, upper_32_bits(wb_fence_gpu_addr));

	fence_seq_n = fence_seq_n_get(&dd->dmas[dma].fence);

	dma_wr(dev, dma, fence_seq_n);

	/* generate an interrupt */
	dma_wr(dev, dma, DMA_PKT(DMA_PKT_TRAP, 0, 0, 0, 0));

	/* flush hdp cache */
	dma_wr(dev, dma, DMA_PKT(DMA_PKT_SRBM_WR, 0, 0, 0, 0));
	dma_wr(dev, dma, (0xf << 16) | (HDP_MEM_COHERENCY_FLUSH_CTL >> 2));
	dma_wr(dev, dma, 1);

	/* bits 0-15 are the vm contexts 0-15 */
	dma_wr(dev, dma, DMA_PKT(DMA_PKT_SRBM_WR, 0, 0, 0, 0));
	dma_wr(dev, dma, (0xf << 16) | (VM_INVALIDATE_REQ >> 2));
	dma_wr(dev, dma, 1);
	return fence_seq_n;
}


/*
 * Heuristics: for a 60 Hz refreshed screen, a frame is 16 ms. Let's say to try
 * to hold it unsignaled up to a quarter of this, 4 ms, with force checking
 * every 2 ms (roughly).
 */
#define RING_TIMEOUTS_MAX 2
#define RING_TIMEOUTS_MS 2
#define FENCE_TIMEOUTS_MAX 2
#define FENCE_TIMEOUT_MS 2
/*
 * Copy, through a dma engine, this mapping's ptes from the aperture-mapped
 * page table into the vram page table, then fence and wait for completion.
 * Returns 0 on success, -BA_ERR on ring or fence timeout.
 */
static long ptes_gpu_side_update(struct pci_dev *dev, struct ba_map *m)
{
	struct dev_drv_data *dd;
	u8 dma;
	u64 dma_sz;
	u32 dma_pkt_cpy_blks_n;		/* pre-computed */
	u32 last_blk_dma_sz;		/* idem */
	u32 dma_pkt_cpy_blks_dws_n;	/* idem */
	u32 fence_seq_n;
	u32 ring_required_dws_n;
	long r;

	dd = pci_get_drvdata(dev);

	dma_sz = m->ptes_n * PTE_SZ;
	dma_pkt_cpy_blks_n = dma_pkt_cpy_blks_count(dma_sz, &last_blk_dma_sz);
	dma_pkt_cpy_blks_dws_n = dma_pkt_cpy_blks_n * DMA_PKT_CPY_BLK_DWS_N;

	/*
	 * WARNING: there are 3 nents... the original... the one from the
	 * sg table init, then the final one from the actual mapping.
	 * The dma api functions must be called using the nents from the sg
	 * table init. The parsing of the scatterlist must be done using the
	 * final one.
	 */
	dma_sync_sg_for_device(&dev->dev, dd->ba.pt_map->sg_tbl.sgl,
				dd->ba.pt_map->sg_tbl.nents, DMA_TO_DEVICE);
	dma = dmas_select(dev);

	spin_lock(&dd->dmas[dma].lock);
	ring_required_dws_n = dma_pkt_cpy_blks_dws_n + DMA_FENCE_DWS_N;
	r = ring_wait(&dd->dmas[dma].ring, ring_required_dws_n,
					RING_TIMEOUTS_MAX, RING_TIMEOUTS_MS);
	if (r == RING_WAIT_TIMEOUT) {
		/* BUGFIX: was returning with the dma ring lock held */
		spin_unlock(&dd->dmas[dma].lock);
		dev_err(&dev->dev, "ba:sg_user_map:%s:ring:unable to book %u dws (timeout)\n",
					dmas_str[dma], ring_required_dws_n);
		return -BA_ERR;
	}
	sg_user_dma_pkt_cpy_blks_wr(dev, m, dma, dma_pkt_cpy_blks_n,
							last_blk_dma_sz);
	fence_seq_n = dma_fence_wr(dev, dma);
	dma_commit(dev, dma);
	spin_unlock(&dd->dmas[dma].lock);

	r = fence_wait(&dd->dmas[dma].fence, fence_seq_n, FENCE_TIMEOUTS_MAX,
							FENCE_TIMEOUT_MS);
	if (r == FENCE_TIMEOUT) {
		dev_err(&dev->dev, "ba:sg_user_map:%s:fence:0x%08x was not signaled in %u ms\n",
				dmas_str[dma], fence_seq_n, FENCE_TIMEOUTS_MAX
							* FENCE_TIMEOUT_MS);
		return -BA_ERR;
	}
	return 0;
}
#undef RING_TIMEOUTS_MAX
#undef RING_TIMEOUTS_MS
#undef FENCE_TIMEOUTS_MAX
#undef FENCE_TIMEOUT_MS
#undef DMA_FENCE_DWS_N

/*
 * Point every pte of this mapping at the dummy page (cpu side only; the
 * vram page table still needs a gpu side update afterwards).
 */
static void dummy_ptes_cpu_side_update(struct pci_dev *dev, struct ba_map *m)
{
	struct dev_drv_data *dd;
	u64 __iomem *pte_cpu_addr;
	u64 ptes_n;
	u64 dummy_pte;

	dd = pci_get_drvdata(dev);

	/*
	 * BUGFIX: live ptes are stored little-endian (cpu_to_le64 in
	 * bus_segment_map); the dummy pte must match, or big-endian hosts
	 * write a garbage pte.
	 */
	dummy_pte = cpu_to_le64(dd->ba.dummy_bus_addr | PTE_VALID | PTE_SYSTEM
				| PTE_SNOOPED | PTE_READABLE | PTE_WRITEABLE);

	/* locate the first pte of this mapping cpu side */
	pte_cpu_addr = dd->ba.pt_map->cpu_addr;
	pte_cpu_addr += GPU_PAGE_IDX(m->gpu_addr - dd->ba.mng.s);

	ptes_n = m->ptes_n;
	while (ptes_n--)
		*pte_cpu_addr++ = dummy_pte;
}

/*
 * Map a user scatterlist into the gpu aperture: account the ptes, allocate
 * an aperture range, write the ptes cpu side, then push them to the vram
 * page table via dma, and link the map into ba.maps.
 * Returns 0 on success, -BA_ERR on failure (all intermediate resources are
 * released on the error paths).
 */
long sg_user_map(struct pci_dev *dev, void __iomem *cpu_addr,
					struct sg_table *sg_tbl, int nents)
{
	struct ba_map *m;
	struct dev_drv_data *dd;
	long r;

	m = kzalloc(sizeof(*m), GFP_KERNEL);
	if (m == NULL) {
		dev_err(&dev->dev, "ba:sg_user:unable to allocate memory for map\n");
		goto err;
	}

	m->type = BA_MAP_USER_SG;
	m->ptes_n = 0;
	m->cpu_addr = cpu_addr; /* usually vm_start of its vm_area_struct */

	r = ptes_account(dev, sg_tbl, nents, m);
	if (r == -BA_ERR)
		goto err_free_map;

	dd = pci_get_drvdata(dev);

	/* allocate a range of the aperture */
	down_write(&dd->ba.maps_sem);
	r = rng_alloc_align(&m->gpu_addr, &dd->ba.mng, GPU_PAGE_SZ * m->ptes_n,
								GPU_PAGE_SZ);
	if (r == -ALGA_ERR) {
		dev_err(&dev->dev, "ba:sg_user:unable to allocate gpu address space\n");
		goto err_maps_sem_unlock;
	}

	m->ptes_start = dd->ba.pt_start
			+ GPU_PAGE_IDX(m->gpu_addr - dd->ba.mng.s) * PTE_SZ;
	ptes_cpu_side_update(dev, m, sg_tbl, nents);
	r = ptes_gpu_side_update(dev, m);
	if (r == -BA_ERR)
		goto err_free_rng; /* BUGFIX: aperture range was leaked here */

	list_add(&m->n, &dd->ba.maps);

	up_write(&dd->ba.maps_sem);
	return 0;

err_free_rng:
	rng_free(&dd->ba.mng, m->gpu_addr);
err_maps_sem_unlock:
	up_write(&dd->ba.maps_sem);

err_free_map:
	kfree(m);
err:
	return -BA_ERR;
}

/*
 * Release a user sg mapping's aperture range, optionally pointing its ptes
 * at the dummy page first (skipped when flgs has BA_NO_PT_UPDATE).
 * The caller must hold ba.maps_sem for writing.
 * NOTE(review): the map is neither unlinked from ba.maps nor kfree'd here,
 * and the ptes_gpu_side_update return value is ignored — presumably both
 * are handled by the caller; confirm.
 */
void sg_user_cleanup(struct pci_dev *dev, struct ba_map *m, u8 flgs)
{
	struct dev_drv_data *dd;

	dd = pci_get_drvdata(dev);

	if (!(flgs & BA_NO_PT_UPDATE)) {
		/* redirect the ptes to the dummy page, cpu side then dma side */
		dummy_ptes_cpu_side_update(dev, m);
		ptes_gpu_side_update(dev, m);
	}
	rng_free(&dd->ba.mng, m->gpu_addr);
}
